### ---- SCRIPT 03: The purpose of this script is to validate the ideal point estimates of a subset ---- ###
### ---- of political and media elites using estimates from a public survey. Where ideal points via  ---- ###
### ---- social media networks are unavailable for certain parties and organisations, the mean estimate ---- ### 
### ---- of affiliated accounts is used instead.                                                        ---- ###

# Strongly penalise scientific notation so numbers are printed in fixed notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # working with big data
library(devtools) # installation of GGally from Github
library(GGally) # for matrix plotting
library(ggrepel) # prevent overlapping labels in ggplot
library(foreign) # load SPSS files

#### ---- DIRECTORIES ---- ####

# Project folders, relative to the working directory. Each ends in "/" so that
# paste0(dir, file) produces a valid path.
raw_dir <- "00-raw_data/"
processed_dir <- "01-processed_data/"
figure_dir <- "02-figures/"

#### ---- LOAD DATA ---- ####

# Guest masterlist with ideal points
guest_data <- paste0(processed_dir, "username_master_list_with_ideal_points.csv") %>%
  fread()

# Twitter user data
user_data <- paste0(processed_dir, "user_ideal_point_data_updated.csv") %>%
  fread()

# YouGov survey data
yougov_data <- paste0(processed_dir, "yougov_summary_data.csv") %>%
  fread()

#### ---- IDEAL POINT VALIDATION ---- #### 

# First, attach each YouGov-rated account's Twitter ideal point, matching on username.
# The right join keeps every row of the YouGov summary data; accounts with no match in
# the Twitter user data receive NA for user_ideal_point.
yougov_data <- right_join(
  select(user_data, username, user_ideal_point),
  yougov_data,
  by = "username"
)

# Some organisations do not have an ideal point of their own, so one is derived from the
# ideal points of affiliated accounts: elite accounts whose profile description mentions
# the organisation's "@username".
elite_users <- user_data %>% filter(elite_account == "Elite Account")

# Search pattern ("@username") for rows lacking an ideal point; NA for every other row.
# NA_character_ keeps the column character-typed even when no row needs a pattern.
yougov_data$username_match <- ifelse(is.na(yougov_data$user_ideal_point),
                                     paste0("@", yougov_data$username),
                                     NA_character_)

# Only rows with a pattern need a lookup. Rows with an NA pattern never matched anything,
# so skipping them leaves the result unchanged while avoiding pointless scans.
proxy_rows <- which(!is.na(yougov_data$username_match))

# Zero-row template carrying the output columns, so the combined table keeps its schema
# (including `name`) even when no organisation needs a proxy or nothing matches.
affiliate_template <- elite_users %>%
  filter(FALSE) %>%
  mutate(name = yougov_data$name[0])

# One table of matching elite accounts per organisation, tagged with the organisation's
# YouGov name (this overwrites any `name` column already present in the elite user data).
# NOTE(review): the handle is applied as a case-insensitive regular expression, so a
# short handle also matches longer handles that start with it - confirm this is intended.
affiliate_matches <- lapply(proxy_rows, function(i) {
  handle_pattern <- yougov_data$username_match[[i]]
  organisation_name <- yougov_data$name[[i]]
  elite_users %>%
    filter(grepl(handle_pattern, description, ignore.case = TRUE)) %>%
    mutate(name = organisation_name)
})

# Bind once at the end rather than growing a data frame inside a loop.
affiliated_accounts_df <- bind_rows(affiliate_template, affiliate_matches)

# Summarise the affiliated accounts of each organisation: number of matched accounts and
# the mean, median and standard deviation of their ideal points (missing values ignored).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
affiliated_accounts_summary <- affiliated_accounts_df %>%
  group_by(name) %>%
  summarise(count = n(),
            affiliate_mean_ideal_point = mean(user_ideal_point, na.rm = TRUE),
            affiliate_median_ideal_point = median(user_ideal_point, na.rm = TRUE),
            affiliate_ideal_point_sd = sd(user_ideal_point, na.rm = TRUE))

# Add the per-organisation affiliate summary to the YouGov data, then, for rows without
# an ideal point of their own, substitute the mean ideal point of their affiliated
# accounts as a proxy. The resulting ideal points are rounded to two decimal places.
yougov_data <- yougov_data %>%
  left_join(affiliated_accounts_summary, by = "name") %>%
  mutate(
    user_ideal_point = ifelse(is.na(user_ideal_point),
                              affiliate_mean_ideal_point,
                              user_ideal_point),
    user_ideal_point = round(user_ideal_point, 2)
  )

# Test the strength of relationship between the Twitter ideal points and the YouGov survey estimates.
# use = "complete.obs" restricts the correlation to accounts with both values present.
# Without it, a single account still lacking an ideal point (no own estimate and no matched
# affiliates) would make cor() return NA, whereas lm() drops such rows by default.
cor(yougov_data$user_ideal_point, yougov_data$mean,
    method = "pearson", use = "complete.obs") # pearson's r = 0.78

# Fit the validation regression once and reuse it for both the summary and the diagnostics.
validation_model <- lm(user_ideal_point ~ mean, data = yougov_data)
summary(validation_model) # r-sq = 0.61

# Standard lm diagnostic plots for the fitted model
plot(validation_model)

# Plot the Twitter ideal point estimates against the YouGov mean ideology estimate for each account
# as a scatter plot, labelled with the name, and with points shaped and coloured by account type.

# R-squared shown on the plot, taken from the same regression reported in the text
# (user_ideal_point ~ mean) rather than typed in by hand, so it cannot drift from the data.
validation_r_squared <- round(
  summary(lm(user_ideal_point ~ mean, data = yougov_data))$r.squared,
  2
)

validation_scatterplot <- ggplot(yougov_data,
                                 aes(x = user_ideal_point,
                                     y = mean,
                                     label = name,
                                     shape = type,
                                     colour = type)) +   # add colour by type
  geom_point() +
  geom_text_repel(size = 1.7, max.overlaps = 15) +
  # group = 1 fits a single trend line across all account types
  geom_smooth(method = "lm", formula = y ~ x, aes(group = 1), colour = "black") +
  labs(x = "Twitter/X Network Estimate",
       y = "General Public Estimate",
       shape = "Type",
       colour = "Type") +
  theme_minimal() +
  theme(legend.position = "top") +
  # parse = TRUE renders the label as a plotmath expression (italic R squared)
  annotate("text", x = -1, y = 8,
           label = paste("italic(R^2) ==", validation_r_squared), parse = TRUE) +
  scale_colour_viridis_d(option = "D", end = 0.9)

# Write the validation scatterplot to the figures folder as a 7 x 4 inch, 300 dpi PNG
# with a white background.
ggsave(
  filename = paste0(figure_dir, "yougov_validation_scatterplot.png"),
  plot = validation_scatterplot,
  width = 7,
  height = 4,
  units = "in",
  dpi = 300,
  bg = "white"
)

#### ---- END ---- ####
